library(text)

# Input locations: the train/dev split and the held-out test split.
train_path <- "/Users/yuwang/Desktop/PMethods/Regression-Anxiety/train_dev.csv"
test_path <- "/Users/yuwang/Desktop/PMethods/Regression-Anxiety/test.csv"

# Read the CSV files into data frames
train <- read.csv(train_path)
test <- read.csv(test_path)

# Fail fast if either split lacks a column the rest of the script relies on:
# `text_long` is the text whose embeddings are used for training/prediction and
# `anxiety` is the score being predicted. Without this check a malformed file
# is only noticed after the long-running embedding steps have finished.
required_columns <- c("text_long", "anxiety")
stopifnot(
  all(required_columns %in% names(train)),
  all(required_columns %in% names(test))
)

# Embed the training data with textEmbed() and time the call.
# Settings, exactly as passed below: the "bert-base-uncased" model, layers
# aggregated to tokens with "concatenate", tokens aggregated to texts with
# "mean", and token-level embeddings not kept.
start_time <- Sys.time()
train_word_embeddings <- textEmbed(
  train,
  model = "bert-base-uncased",
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  keep_token_embeddings = FALSE
)
end_time <- Sys.time()
# Elapsed wall-clock time (a difftime whose unit R picks automatically).
duration <- end_time - start_time
# format() keeps the unit ("mins", "hours", ...). Passing the raw difftime to
# cat() prints only the bare number, which is ambiguous across runs.
cat("Execution duration for train embedding:", format(duration), "\n")

# Completed layers output for text_long (variable: 1/1, duration: 22.610270 mins).
# Completed layers aggregation for word_type_embeddings. 
# Completed layers aggregation (variable 1/1, duration: 1.486035 hours).
# > end_time <- Sys.time()
# > # Calculate duration
#   > duration <- end_time - start_time
# > # Print the duration
#   > cat("Execution duration for train embedding:", duration, "\n")
# Execution duration for train embedding: 1.959194  (hours; cat() did not print the difftime unit)

# Write the training embeddings to disk so they can be reloaded with readRDS().
saveRDS(
  object = train_word_embeddings,
  file = "/Users/yuwang/Desktop/PMethods/Regression-Anxiety/train_word_embeddings.rds"
)


# Embed the test data with textEmbed() and time the call.
# Settings, exactly as passed below: the "bert-base-uncased" model, layers
# aggregated to tokens with "concatenate", tokens aggregated to texts with
# "mean", and token-level embeddings not kept.
start_time <- Sys.time()
test_word_embeddings <- textEmbed(
  test,
  model = "bert-base-uncased",
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  keep_token_embeddings = FALSE
)
end_time <- Sys.time()
# Elapsed wall-clock time (a difftime whose unit R picks automatically).
duration <- end_time - start_time
# format() keeps the unit ("mins", "hours", ...). Passing the raw difftime to
# cat() prints only the bare number, which is ambiguous across runs.
cat("Execution duration for test embedding:", format(duration), "\n")


# Completed layers output for text_long (variable: 1/1, duration: 4.208215 mins).
# Completed layers aggregation for word_type_embeddings. 
# Completed layers aggregation (variable 1/1, duration: 8.674991 mins).
# > end_time <- Sys.time()
# > # Calculate duration
#   > duration <- end_time - start_time
# > # Print the duration
#   > cat("Execution duration for test embedding:", duration, "\n")
# Execution duration for test embedding: 12.96107  (minutes; cat() did not print the difftime unit)

# Write the test embeddings to disk.
saveRDS(
  object = test_word_embeddings,
  file = "/Users/yuwang/Desktop/PMethods/Regression-Anxiety/test_word_embeddings.rds"
)

# Load both embedding sets back from their .rds files.
train_word_embeddings <- readRDS(
  file = "/Users/yuwang/Desktop/PMethods/Regression-Anxiety/train_word_embeddings.rds"
)
test_word_embeddings <- readRDS(
  file = "/Users/yuwang/Desktop/PMethods/Regression-Anxiety/test_word_embeddings.rds"
)

# For each training-set size, fit a regression model with textTrain() on the
# first `i` training rows, time the fit, predict on the test embeddings with
# textPredict(), and print the Pearson correlation and RMSE against
# test$anxiety.
for (i in c(500, 1000, 2000)) {
  # Refuse to run with a size larger than the available data; indexing past
  # the end would otherwise introduce NA rows or fail with a less clear error.
  stopifnot(
    i <= nrow(train),
    i <= nrow(train_word_embeddings$texts$text_long)
  )
  train_rows <- seq_len(i)

  # NOTE(review): this loop used to compute an `n_folds` value (10, or 5 when
  # i == 1000) that was never passed to textTrain(). It was dead code and has
  # been removed; `outside_folds` stays at the 5 that every run actually used.
  # If per-size fold counts were intended, pass them via `outside_folds`.

  # Record start time
  start_time <- Sys.time()
  model <- textTrain(
    # the predictor variables (i.e., the word embeddings)
    x = train_word_embeddings$texts$text_long[train_rows, ],
    # the criterion variable (i.e., the rating scale score)
    y = train$anxiety[train_rows],
    force_train_method = "regression",
    outside_folds = 5
  )
  end_time <- Sys.time()
  # Elapsed wall-clock time (a difftime whose unit R picks automatically).
  duration <- end_time - start_time

  cat("Training size:", i, "\n")
  print(duration)
  # format() keeps the unit; cat() on the raw difftime prints a bare number.
  cat("Execution duration for training:", format(duration), "\n")

  predictions <- textPredict(
    model,
    word_embeddings = test_word_embeddings$texts
  )
  predicted <- predictions$text_long__ypred
  observed <- test$anxiety

  # Pearson correlation between predicted and observed scores
  correlation <- cor(predicted, observed, method = "pearson")
  cat("Correlation", correlation, "\n")

  # Root mean squared error of the predictions
  errors <- predicted - observed
  rmse <- sqrt(mean(errors^2))
  cat("rmse", rmse, "\n")
}